{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c5ca4dc2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import Series, DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f68d8ed5",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>description</th>\n",
       "      <th>province</th>\n",
       "      <th>variety</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>US</td>\n",
       "      <td>This tremendous 100% varietal wine hails from ...</td>\n",
       "      <td>California</td>\n",
       "      <td>Cabernet Sauvignon</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Spain</td>\n",
       "      <td>Ripe aromas of fig, blackberry and cassis are ...</td>\n",
       "      <td>Northern Spain</td>\n",
       "      <td>Tinta de Toro</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>US</td>\n",
       "      <td>Mac Watson honors the memory of a wine once ma...</td>\n",
       "      <td>California</td>\n",
       "      <td>Sauvignon Blanc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>US</td>\n",
       "      <td>This spent 20 months in 30% new French oak, an...</td>\n",
       "      <td>Oregon</td>\n",
       "      <td>Pinot Noir</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>France</td>\n",
       "      <td>This is the top wine from La Bégude, named aft...</td>\n",
       "      <td>Provence</td>\n",
       "      <td>Provence red blend</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  country                                        description        province  \\\n",
       "0      US  This tremendous 100% varietal wine hails from ...      California   \n",
       "1   Spain  Ripe aromas of fig, blackberry and cassis are ...  Northern Spain   \n",
       "2      US  Mac Watson honors the memory of a wine once ma...      California   \n",
       "3      US  This spent 20 months in 30% new French oak, an...          Oregon   \n",
       "4  France  This is the top wine from La Bégude, named aft...        Provence   \n",
       "\n",
       "              variety  \n",
       "0  Cabernet Sauvignon  \n",
       "1       Tinta de Toro  \n",
       "2     Sauvignon Blanc  \n",
       "3          Pinot Noir  \n",
       "4  Provence red blend  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the wine reviews, keeping only the four columns used in this notebook\n",
    "filename = '../data/winemag-150k-reviews.csv'\n",
    "\n",
    "df = pd.read_csv(\n",
    "    filename,\n",
    "    usecols=['country', 'province', 'description', 'variety'],\n",
    ")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "80741492",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "description\n",
       "fruit      56327\n",
       "acidity    32536\n",
       "tannins    32098\n",
       "cherry     30639\n",
       "black      24568\n",
       "spice      22601\n",
       "sweet      21243\n",
       "notes      19581\n",
       "fresh      17641\n",
       "berry      17083\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# What are the 10 most common words containing 5 or more letters in the wine descriptions?\n",
    "# Turn all words into lowercase, and remove all punctuation and symbols at the start or end of each word,\n",
    "# for easier comparison.\n",
    "\n",
    "# Also: remove the words flavors, aromas, finish, drink, and palate.\n",
    "\n",
    "def top_10_words(s, n=10, min_length=5, stop_words=None):\n",
    "    \"\"\"Return the most common words in a Series of text.\n",
    "\n",
    "    Each text is lowercased and split on whitespace, and ASCII punctuation\n",
    "    is stripped from both ends of every word before counting.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    s : pd.Series\n",
    "        Series of strings, e.g. the wine descriptions.\n",
    "    n : int, default 10\n",
    "        Number of words to return.\n",
    "    min_length : int, default 5\n",
    "        Minimum number of characters a stripped word must have to be counted.\n",
    "    stop_words : iterable of str, optional\n",
    "        Words to leave out of the count. Defaults to\n",
    "        ['flavors', 'aromas', 'finish', 'drink', 'palate'].\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.Series\n",
    "        Counts of the n most frequent remaining words, most frequent first.\n",
    "    \"\"\"\n",
    "    if stop_words is None:\n",
    "        stop_words = ['flavors', 'aromas', 'finish', 'drink', 'palate']\n",
    "\n",
    "    words = (\n",
    "        s\n",
    "        .str.lower()\n",
    "        .str.split()\n",
    "        .explode()\n",
    "        # Strip every ASCII punctuation mark rather than a hand-picked few,\n",
    "        # so 'fruit;' and '(fruit)' are counted together with 'fruit'.\n",
    "        .str.strip(string.punctuation)\n",
    "    )\n",
    "\n",
    "    keep = (words.str.len() >= min_length) & ~words.isin(stop_words)\n",
    "    return words.loc[keep].value_counts().head(n)\n",
    "\n",
    "\n",
    "# Most common words across every review in the data set\n",
    "top_10_words(df.loc[:, 'description'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e44dfead",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "description\n",
       "fruit      46371\n",
       "acidity    22270\n",
       "tannins    21929\n",
       "cherry     19440\n",
       "spice      18522\n",
       "black      17758\n",
       "notes      16569\n",
       "fresh      16200\n",
       "berry      15478\n",
       "sweet      12708\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top 10 words in reviews of wines whose province is not California\n",
    "outside_california = df['province'].ne('California')\n",
    "top_10_words(df.loc[outside_california, 'description'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e227625c",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "description\n",
       "fruit        8688\n",
       "acidity      8632\n",
       "tannins      6491\n",
       "fruits       5449\n",
       "fresh        4213\n",
       "character    3494\n",
       "black        3119\n",
       "texture      3069\n",
       "years        2880\n",
       "crisp        2875\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top 10 words in reviews of wines whose country is France\n",
    "from_france = df['country'].eq('France')\n",
    "top_10_words(df.loc[from_france, 'description'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1ea8038a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "description\n",
       "fruit         9133\n",
       "acidity       8346\n",
       "apple         5879\n",
       "citrus        5368\n",
       "crisp         4903\n",
       "chardonnay    4871\n",
       "green         4177\n",
       "notes         4021\n",
       "sweet         3850\n",
       "pineapple     3847\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top 10 words for white wines, approximated here by three named varieties\n",
    "white_varieties = ['Chardonnay', 'Sauvignon Blanc', 'Riesling']\n",
    "is_white = df['variety'].isin(white_varieties)\n",
    "\n",
    "top_10_words(df.loc[is_white, 'description'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "66fbe1f7",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "description\n",
       "fruit         15011\n",
       "cherry        14024\n",
       "tannins       13138\n",
       "black          9535\n",
       "blackberry     6764\n",
       "acidity        6338\n",
       "pinot          6326\n",
       "sweet          5982\n",
       "cherries       5370\n",
       "shows          5337\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top 10 words for red wines, approximated here by five named varieties\n",
    "red_varieties = ['Pinot Noir', 'Cabernet Sauvignon', 'Syrah', 'Merlot', 'Zinfandel']\n",
    "is_red = df['variety'].isin(red_varieties)\n",
    "\n",
    "top_10_words(df.loc[is_red, 'description'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8e8da872",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "description\n",
       "acidity       1135\n",
       "fruit          696\n",
       "crisp          669\n",
       "fresh          622\n",
       "strawberry     534\n",
       "light          514\n",
       "raspberry      509\n",
       "cherry         469\n",
       "fruity         428\n",
       "fruits         419\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top 10 words for wines whose variety is exactly 'Rosé'\n",
    "is_rose = df['variety'].eq('Rosé')\n",
    "top_10_words(df.loc[is_rose, 'description'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6d24bd52",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "description\n",
       "fruit       22784\n",
       "tannins     15968\n",
       "cherry      13974\n",
       "acidity     12496\n",
       "black       11219\n",
       "cabernet     9433\n",
       "spice        7890\n",
       "sweet        7873\n",
       "blend        7563\n",
       "shows        7264\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top 10 words across the pooled reviews of the five most frequent varieties\n",
    "top_5_varieties = df['variety'].value_counts().head(5).index\n",
    "in_top_5 = df['variety'].isin(top_5_varieties)\n",
    "\n",
    "top_10_words(df.loc[in_top_5, 'description'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9871622",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
